## NOTE(review): this removes every object returned by ls() in the environment
## the script is evaluated in, so sourcing this file from an interactive
## session or a driver script wipes that environment's variables. It does not
## detach packages or reset options. Consider running the script in a fresh R
## process instead.
rm(list = ls())

## Packages

library(tm)
library(tokenizers)
library(stopwords)
library(tidytext)
library(lexicon)
library(tidyverse)

## Get scraped data

## Read the raw tenders and keep only the rows whose institution name
## contains 'Stadt' or 'Gemeinde'; every other row is dropped.
tenders <- read_rds('data/data_tenders_raw.rds') %>%
  filter(str_detect(institution, 'Stadt|Gemeinde'))

## All institution terms 

## Frequency table of the space-separated tokens found in the institution
## names, most frequent first. Commas are stripped and only tokens longer
## than four characters are kept.
inst_terms <- tenders %>%
  pull(institution) %>%
  str_split(' ') %>%
  unlist() %>%
  ## strip every comma in a token, not just the first one
  str_remove_all(',') %>%
  ## the length filter also discards empty tokens
  .[nchar(.) > 4] %>%
  table() %>%
  sort(decreasing = TRUE)

## Some more cleanup

## Normalise the tender titles into lower-case, space-separated terms:
## punctuation and digits are stripped, German stop words and very short
## tokens are dropped, and the day-care synonyms are mapped to 'kindergarten'.
## NOTE(review): two fixes change the resulting terms compared to earlier
## runs -- stop words at the start of a title are now removed as well, and
## short tokens no longer glue their neighbours together.
title_terms <- tenders %>%
  dplyr::select(title) %>%
  mutate(
    title = str_replace_all(title, "[[:punct:]]", ' '),
    title = tolower(title),
    title = tm::removeNumbers(title),
    title = str_squish(title),
    title = tm::removePunctuation(title),
    ## removeWords() already matches on word boundaries; prefixing each stop
    ## word with a space made it miss stop words at the start of a title
    title = tm::removeWords(title, stopwords::stopwords('german')),
    title = str_squish(title),
    ## drop 1-2 character tokens that sit between two spaces while keeping the
    ## spaces themselves, so the neighbouring words are not merged
    title = str_remove_all(title, '(?<=\\s)\\w{1,2}(?=\\s)'),
    title = gsub('[^[:alnum:] ]', '', title),
    title = str_squish(title),
    title = str_replace_all(title, 'kita|kindertagesstätte', 'kindergarten')
  )

## Custom remove words

## Regular expression matching the custom stop list (articles, place names,
## abbreviations). The alternation is wrapped in word boundaries so that only
## whole tokens match; without them, removing e.g. 'los' or 'int' also cut
## those letters out of longer words such as 'schloss' or 'winter'.
## 'straße' is listed as a whole word; the former 'aße' entry presumably only
## cleaned up what was left of 'straße' after 'str' had been cut out of it.
remwords <- paste0(
  '\\b(?:',
  paste0(
    unique(c(
      "eines",
      "einer",
      "stadt", "rhein", "salzbergen",
      "zwei", "lauta", "laubusch", "neutraubling",
      "markt", "los", "unter", "int",
      "questenberg", "nordheim", "käppele",
      "stralsund", "kirchheim", "teck",
      "ludwigslust", "jütrichau",
      "gemeinde", "mainhardt", "str",
      "straße"
    )),
    collapse = '|'
  ),
  ')\\b'
)

## Remove and some cleanup

## Strip everything matched by the custom pattern in `remwords`, then drop
## the 1-2 character tokens that the removal may have left behind.
title_terms <- title_terms %>%
  mutate(
    title = str_remove_all(title, remwords),
    title = str_squish(title),
    ## remove only the short token and keep the surrounding spaces; replacing
    ## the token together with both spaces used to glue neighbouring words
    title = str_remove_all(title, '(?<=\\s)\\w{1,2}(?=\\s)'),
    title = str_squish(title)
  )

## Get most common terms

## Count every single-word term across all titles, most frequent first.
## `freq` is the count expressed as a percentage of the number of titles
## (rows of `title_terms`); `translation` is created as an empty column.
unigrams <- title_terms %>%
  unnest_ngrams(output = 'words', input = 'title', n = 1) %>%
  count(words, name = 'n') %>%
  arrange(desc(n)) %>%
  mutate(
    freq = round(n * 100 / nrow(title_terms), 2),
    translation = ''
  )

## These are in German -> load file with translations

## Load

## Read the term table that carries the translations, title-case the terms
## and put the columns in the order used by the output table.
## read_excel() is namespace-qualified because readxl is not attached by any
## of the library() calls at the top of this script.
terms <- readxl::read_excel('data/data_tenders_translations.xlsx') %>%
  mutate(words = str_to_title(words)) %>%
  dplyr::select(words, translation, n, freq)

# Table A.12: Most frequent terms in municipal tenders ----

## Build the LaTeX table from the first 20 rows of `terms` and print it.
## knitr / kableExtra functions are namespace-qualified because neither
## package is attached at the top of this script. head() is used instead of
## terms[1:20, ] so that a shorter table does not get padded with NA rows.
out <- knitr::kable(head(terms, 20),
                    format = 'latex',
                    col.names = c('Term', 'Translation',
                                  'No. of mentions',
                                  'Rel. freq. of mentions (%)'),
                    caption = 'Most frequent terms in municipal tenders',
                    booktabs = TRUE, linesep = "",
                    label = 'tenders') %>%
  kableExtra::kable_styling() %>%
  kableExtra::column_spec(2, width = "6cm") %>%
  kableExtra::column_spec(3, width = "3cm") %>%
  kableExtra::column_spec(4, width = "3cm")
out

  
